# Import libraries
import re
import sys
from hashlib import sha1
from pandas_profiling import ProfileReport
import altair as alt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
# train test split and cross validation
from sklearn.model_selection import (
train_test_split,
)
# Load the IMDB review dataset from the CSV hosted in the
# SentimentDictionaries GitHub repository.
imdb_df = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/nproellochs/SentimentDictionaries/master/Dataset_IMDB.csv",
)
# Bare expression: displays the frame when run as a notebook cell.
imdb_df
| | Id | Text | Author | Rating |
|---|---|---|---|---|
| 0 | 29420 | In my opinion, a movie reviewer's most importa... | Schwartz | 0.1 |
| 1 | 17219 | STARSHIP TROOPERS (director: Paul Verhoeven; c... | Schwartz | 0.2 |
| 2 | 18406 | THE SCHOOL OF FLESH (ECOLE DE lA CHAIR, L') (D... | Schwartz | 0.2 |
| 3 | 18648 | LOCK, STOCK AND TWO SMOKING BARRELS (director:... | Schwartz | 0.2 |
| 4 | 20021 | RUN LOLA RUN (LOLA RENNT)(director/writer: Tom... | Schwartz | 0.2 |
| ... | ... | ... | ... | ... |
| 5001 | 7470 | The conventional wisdom is that movie sequels ... | Rhodes | 0.9 |
| 5002 | 7853 | Nicolas Roeg's mesmerizing 1971 film WALKABOUT... | Rhodes | 0.9 |
| 5003 | 8309 | The movie AIR FORCE ONE should require a docto... | Rhodes | 0.9 |
| 5004 | 8912 | "Well, Jones, at least you haven't forgotten h... | Rhodes | 0.9 |
| 5005 | 9085 | In a time of bloated productions where special... | Rhodes | 0.9 |
5006 rows × 4 columns
Split the data into train_df (80%) and test_df (20%).
# Hold out 20% of the rows as the test set; the fixed random_state keeps
# the shuffle (and therefore the split) reproducible across runs.
train_df, test_df = train_test_split(
    imdb_df,
    random_state=123,
    test_size=0.2,
)
# Build a profiling report over the training split only, then embed it
# in the notebook. `minimal=True` is deliberately left off here; it can be
# passed to ProfileReport as an extra keyword argument if needed.
profile = ProfileReport(
    train_df,
    title="Pandas Profiling Report",
)
profile.to_notebook_iframe()
# Frequency of each distinct rating value in the training split,
# shown as a one-column frame (most frequent first).
train_df["Rating"].value_counts().to_frame()
| | Rating |
|---|---|
| 0.70 | 688 |
| 0.50 | 640 |
| 0.60 | 637 |
| 0.40 | 484 |
| 0.80 | 387 |
| ... | ... |
| 0.14 | 1 |
| 0.32 | 1 |
| 0.22 | 1 |
| 0.17 | 1 |
| 0.99 | 1 |
86 rows × 1 columns
# Print a summary of the training split: index range, per-column
# non-null counts, dtypes, and memory usage.
train_df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 4004 entries, 4715 to 3582
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   Id      4004 non-null   int64
 1   Text    4004 non-null   object
 2   Author  4004 non-null   object
 3   Rating  4004 non-null   float64
dtypes: float64(1), int64(1), object(2)
memory usage: 156.4+ KB
| Column Name | Column Type | Description |
|---|---|---|
| Id | Numeric | Unique ID assigned to each observation. |
| Text | Free Text | Body of the review content. |
| Author | Categorical | Name of the review's author. |
| Rating | Numeric | Rating given along with the review (values range from 0 to 1 in the training split). |
# Columns to exclude from the feature set. The names must match the
# DataFrame's column labels exactly -- pandas labels are case-sensitive,
# and the identifier column in this dataset is "Id", not "id".
drop_features = ['Id', 'Author']
# Summary statistics for every column of the training split;
# include="all" covers the object (text/categorical) columns as well as
# the numeric ones, so inapplicable statistics show up as NaN.
train_df.describe(include="all")
| | Id | Text | Author | Rating |
|---|---|---|---|---|
| count | 4004.000000 | 4004 | 4004 | 4004.000000 |
| unique | NaN | 3978 | 4 | NaN |
| top | NaN | "When I grow up and get married, I'm going to ... | Rhodes | NaN |
| freq | NaN | 2 | 1415 | NaN |
| mean | 13887.965035 | NaN | NaN | 0.580564 |
| std | 9332.642525 | NaN | NaN | 0.181505 |
| min | 1858.000000 | NaN | NaN | 0.000000 |
| 25% | 5404.250000 | NaN | NaN | 0.450000 |
| 50% | 11211.000000 | NaN | NaN | 0.600000 |
| 75% | 23263.250000 | NaN | NaN | 0.700000 |
| max | 29866.000000 | NaN | NaN | 1.000000 |
# Histogram of the Rating column in the training split: ratings are
# binned along the x-axis and the bar height is the row count per bin.
(
    alt.Chart(train_df)
    .mark_bar()
    .encode(
        x=alt.X("Rating", bin=True, title="Rating"),
        y="count()",
    )
)